/* create_VA_outcomes.do -******************************************************

	This file takes the SIMS dataset and creates test-score value-added measures.

******************************************************************************/
set more off

* -----------------------------------------------------------------------------
* Step 1: reduce SIMS to each student's first record, derive a cohort from it,
* and write out two helper files:
*   - all_sasid_full_build_fstgrade.dta : SASIDs whose cohort is 1997-2003
*   - BPS_schls_forVA.dta               : distinct BPS school codes among them
* -----------------------------------------------------------------------------

use "$stata_data_sims/all_sasid_full_build.dta", clear

* Year in which the school year starts: October records use the observed year,
* end-of-year records are shifted back by one. Any other value of t is missing.
gen simyear0 = cond(t == "oct", simyear, cond(t == "eoy", simyear - 1, .))

* Recode pre-K to -1 and the kindergarten codes to 0, then convert to numeric.
* Anything that is still non-numeric becomes missing under force and is
* tagged with the extended missing value .a
replace grade = cond(grade == "PK", "-1", cond(inlist(grade, "KF", "KP", "KT"), "0", grade))
destring grade, replace force
replace grade = .a if missing(grade)

* Within-year ordering: 0 = October collection, 1 = end-of-year collection
gen period = cond(t == "oct", 0, cond(t == "eoy", 1, .))

* BPS flag: district 0035, excluding the three Boston charter school codes
gen bps = district == "0035" & !inlist(full_school, "00350560", "00350545", "00350575")

* Keep one record per student: the first in the sort order below.
* NOTE(review): sort/bysort have no descending-order syntax, so "-attend" may
* not sort attendance high-to-low as it appears to intend - confirm how Stata
* parses this varlist (gsort is the command that accepts a leading minus).
bysort sasid (simyear0 period enstat -attend): keep if _n == 1

* Cohort computed from the school-year start and grade of that first record;
* records with missing grade get a missing cohort and are dropped here.
gen app_cohort = simyear0 - grade - 1
keep if inrange(app_cohort, 1997, 2003)

* Save the SASIDs in scope without disturbing the data in memory
preserve
	keep sasid
	save "$stata_data_sims/all_sasid_full_build_fstgrade.dta", replace
restore

* Distinct numeric school codes of the BPS first-record schools
drop if bps != 1
keep full_school
destring full_school, replace
duplicates drop
save "$stata_data_sims/BPS_schls_forVA.dta", replace

***************** Build SIMS dataset to run VA regressions **********************
use "$stata_data_sims/all_sasid_full_build.dta", clear

* Restrict to the SASIDs saved for the relevant cohorts (matched records only)
merge m:1 sasid using "$stata_data_sims/all_sasid_full_build_fstgrade.dta", keep(match) nogenerate

* School-year start: an October record belongs to the year it is observed in;
* an end-of-year record belongs to the school year that began one year earlier.
gen simyear0 = cond(t == "oct", simyear, cond(t == "eoy", simyear - 1, .))

* Numeric grade: pre-K -> -1, kindergarten codes -> 0, unparseable -> .a
replace grade = cond(grade == "PK", "-1", cond(inlist(grade, "KF", "KP", "KT"), "0", grade))
destring grade, replace force
replace grade = .a if missing(grade)

* 0 = beginning-of-year (October) record, 1 = end-of-year record
gen period = cond(t == "oct", 0, cond(t == "eoy", 1, .))

* SIMS school codes in 2001-2002 are formatted differently (6 digits instead of 8).
* Following the Stand and Deliver replication code, rebuild them as
* org_code*10000 + (last three characters of the school code) so they line up
* with later years once full_school is converted to numeric.
gen schl_suffix = substr(full_school, -3, .) if inlist(simyear0, 2001, 2002)
destring schl_suffix, replace
destring org_code, replace
replace schl_suffix = org_code*10000 + schl_suffix
tostring schl_suffix, replace
replace full_school = schl_suffix if inlist(simyear0, 2001, 2002)
drop schl_suffix org_code
destring full_school, replace

*Special Education Results
*Source variables: spd_prim_dis spd_natr_svcs spd_lvl_need spd_placement
*Each indicator is 0/1 where spd_lvl_need is populated and missing otherwise.
gen spd_lowest_need = spd_lvl_need == "01" if spd_lvl_need != ""
gen spd_low_need = spd_lvl_need == "02" if spd_lvl_need != ""
gen spd_mod_need = spd_lvl_need == "03" if spd_lvl_need != ""
gen spd_high_need = spd_lvl_need == "04" if spd_lvl_need != ""
gen spd_no_need = spd_lvl_need == "500" if spd_lvl_need != ""
gen any_spd = spd_no_need != 1 if spd_lvl_need != ""

* Since 2001 and 2002 SIMS files do not have SPED Level of Need variable, use SPED placement
* variable instead. Both replacements are restricted to records with a non-empty
* placement code so that an empty placement is left as-is rather than being
* coded as spd_no_need = 0 (which would be inconsistent with any_spd).
replace spd_no_need = spd_placement == "00" if spd_placement != "" & inlist(simyear0, 2001, 2002)
replace any_spd = spd_no_need != 1 if spd_placement != "" & inlist(simyear0, 2001, 2002)

*Absences: days of membership minus days attended
gen num_absences = member - attend

*Free/Reduced lunch status
* NOTE(review): a missing low_inc yields frpl = 0 here, not missing - confirm
* that this is intended.
destring low_inc, replace
gen frpl = inlist(low_inc, 1, 2)

*Female flag
gen female = inlist(gender, "F", "f", "2")

*Race flag
* raceFIX recodes race onto the category numbers used by the indicators below;
* it is applied only to records with simyear0 >= 2005.
destring(race), replace
gen raceFIX = .
replace raceFIX = 2 if race == 3 // Asian
replace raceFIX = 3 if race == 2 // Black
replace raceFIX = 5 if race == 1 // White
replace raceFIX = 4 if race == 4 // Native American
replace raceFIX = 1 if(race<=63 & race>=5) // Other race
* Must come after the "Other race" line: codes 33-37 fall inside 5-63 and are
* overwritten here.
replace raceFIX = 99 if inlist(race, 33, 34, 35, 36, 37) // Hispanic

replace race = raceFIX if simyear0 >= 2005
gen asian = race == 2
gen black = race == 3
gen white = race == 5
gen nativeam = race == 4
gen otherrace = race == 1
gen hispanic = race == 99

*Keep only students with enrolled status
keep if enstat == 1

*Create total suspensions (missing if either component is missing)
gen susp = in_susp + out_susp

*Collapse to one row per student (sasid) and grade.
* The sort fixes which record (firstnm) picks up within each sasid-grade cell.
* NOTE(review): "-attend" in a sort varlist may not mean descending attendance;
* confirm how Stata parses it.
destring lep, replace
sort sasid grade simyear0 period enstat -attend
collapse ///
	(firstnm) full_school school asian black white nativeam otherrace hispanic female ///
	(min) simyear0 ///
	(max) lep any_spd frpl ///
	(sum) num_absences susp, ///
	by(sasid grade)

*Attach MCAS scores by numeric student id and grade; keep matched rows only
destring sasid, gen(sasid_no) force
merge m:1 sasid_no grade using "$stata_data_mcas/mcas_sasid_long.dta", keep(match) nogenerate

*Cubed standardized scores (std_m, std_e)
foreach subj in m e {
	gen std_`subj'_cu = std_`subj'^3
}

*Remove grade-years without a test before building lags:
* no test scores in 9th grade, and MCAS was not administered in 5th grade
* for simyear0 2003 and 2004
drop if grade == 9 | (grade == 5 & inlist(simyear0, 2003, 2004))

*Baseline controls: value from the student's previous remaining grade row
sort sasid_no grade
foreach base in std_m_cu std_e_cu num_absences susp {
	by sasid_no: gen `base'_lag = `base'[_n-1]
}

*Save dataset
save "$stata_data_sims/SIMS_for_VA_regressions.dta", replace

* Write one file per grade containing that grade's students plus a full set of
* school indicator variables named schl_g<grade>_* (precomputed so the
* regressions run faster later)
use "$stata_data_sims/SIMS_for_VA_regressions.dta", clear

rename full_school schl
drop school

foreach gr of numlist 3/8 10 {
	preserve
		keep if grade == `gr'
		* noomit: create an indicator for every school, with no omitted base category
		xi i.schl, noomit
		rename _Ischl_* schl_g`gr'_*
		save "$stata_data_sims/schlflag_`gr'.dta", replace
	restore
}



**************************** Run VA regressions *********************************
* For each grade in the loop: estimate school value-added on std_m with olsvam,
* set placeholder zeros to missing, subtract the student-weighted BPS average,
* and save one school-by-grade file (VA_gr<grade>_bpsdemeaned.dta).
* olsvam is not defined in this file; it is presumably a user-written command
* that must be installed - verify before running.
set matsize 10000
foreach gr in 4 5 6 7 8 10 {

	use "$stata_data_sims/SIMS_for_VA_regressions.dta", clear

	keep if grade == `gr'
	* Bring in this grade's school indicators (schl_g`gr'_*)
	merge 1:1 sasid_no using "$stata_data_sims/schlflag_`gr'.dta", keep(master match)
	display "`gr'"

	* Estimation sample: outcome, all covariates and all lagged baselines non-missing
	gen keep_flag_test = !mi(std_m) & !mi(female) & !mi(black)& !mi(white) & !mi(hispanic) & !mi(frpl) & !mi(lep) & !mi(any_spd) & !mi(std_m_cu_lag) & !mi(std_e_cu_lag) & !mi(num_absences_lag) & !mi(susp_lag) & !mi(simyear0)

	* test-score VAM
	olsvam std_m schl_g`gr'_* if keep_flag_test == 1, gen(VAMtest_gr`gr') covs(female black white hispanic frpl lep any_spd std_m_cu_lag std_e_cu_lag num_absences_lag susp_lag simyear0)

	save "$stata_data_sims/VA_reg_results_gr`gr'.dta", replace

	* Check that 0's in OLSVAM variable are because no one at the school had keep_flag == 1
	local vam_type test
	foreach v of local vam_type {
		bysort full_school: egen max_keep_flag = max(keep_flag_`v')
		count if VAM`v'_gr`gr' == 0 // Both counts should match if all 0's are explained by the keep_flag issue
		count if VAM`v'_gr`gr' == 0 & max_keep_flag == 0
		* Exact zeros are treated as "no estimate" and set to missing
		replace VAM`v'_gr`gr' = . if VAM`v'_gr`gr' == 0
		drop max_keep_flag
	}

	* Flag BPS schools (bps = 1 for schools in the BPS list, missing otherwise)
	drop _merge
	merge m:1 full_school using "$stata_data_sims/BPS_schls_forVA.dta", keep(master match)
	gen bps = 1 if _merge == 3
	drop _merge

	* For each grade, create a BPS average, weighted by number of students at school
	foreach v of local vam_type {
		rename VAM`v'_gr`gr' VAM`v'
		* VAM for BPS rows, missing for everyone else
		gen VAM`v'xfreq = VAM`v'*bps
		* Denominator: BPS rows that actually carry an estimate. Counting BPS rows
		* whose VAM is missing would inflate the denominator without adding to the
		* numerator (egen total() skips missings), pulling the average toward zero.
		egen N = total(bps == 1 & !missing(VAM`v'))
		egen sum = total(VAM`v'xfreq)

		* With no usable BPS rows this is 0/0 and the average is missing
		gen wgt_bps_VAM`v'_avg = sum / N
		replace wgt_bps_VAM`v'_avg = round(wgt_bps_VAM`v'_avg, .00001)

		* Subtract BPS average from every VAM estimate
		gen VAM`v'_to_BPS = VAM`v' - wgt_bps_VAM`v'_avg

		drop N sum VAM`v'xfreq
	}

	* Reduce to the distinct school-grade rows of VAM variables
	keep full_school grade VAM* VAM*_to_BPS wgt_bps_VAM*_avg
	duplicates drop
	save "$stata_data_sims/VA_gr`gr'_bpsdemeaned.dta", replace

}

* Stack the per-grade school VAM files into a single school-by-grade dataset.
* Start from an empty dataset so the result depends only on the saved files,
* not on whatever happens to be left in memory from the preceding step.
clear
foreach gr in 4 5 6 7 8 10  {
	append using "$stata_data_sims/VA_gr`gr'_bpsdemeaned.dta"
}

* Save school-grade VAM estimates
duplicates drop
rename grade simgrade

save "$stata_data_sims/VAM_schl_gr_ests.dta", replace
